---
title: "R Notebook"
output:
  html_document:
    df_print: paged
---




# Data

Loading the NDC corpus text files.


```{r message=FALSE}
#Loading packages and data
library(readtext)
library(quanteda)
library(rworldmap)
library(RColorBrewer)
library(haven)
library(readxl)

library(tidyverse)
library(tidymodels)
library(rsample)

# Modelling packages
library(caret)
library(caretEnsemble)
library(earth)
library(xgboost)
library(ranger)
library(rpart)
library(rpart.plot)

# Model interpretability packages
library(vip)
library(pdp)
library(lime)
library(jtools)
```

```{r}
# The NDC corpus is distributed as a tarball via Dataverse; this chunk reads
# every file found under ../NDC_corpus/ and derives a "Country" docvar from
# the file names.
ndc_files <- readtext(
  "../NDC_corpus/*",
  docvarsfrom = "filenames",
  docvarnames = "Country"
)

# Strip only a literal, trailing ".txt" extension. An unescaped ".txt" is a
# regular expression in which "." matches any character, so it could remove
# e.g. "atxt" from the middle of a document id instead of the extension.
ndc_files$doc_id <- str_replace(ndc_files$doc_id, "\\.txt$", "")

```


```{r}
# Turn the readtext data frame into a quanteda corpus; the document bodies
# live in the "text" column.
ndc_corpus <- corpus(x = ndc_files, text_field = "text")

```


```{r}

# Per-country totals of sentences and tokens.
# summary() only reports the first `n` documents, so ask for all of them via
# ndoc() instead of a hard-coded document count that silently truncates the
# table if the corpus grows.
corp_summary <- summary(ndc_corpus, n = ndoc(ndc_corpus)) %>%
  group_by(Country) %>%
  summarise(
    total_sentences = sum(Sentences),
    total_words = sum(Tokens)
  )

# Persist the summary table next to the notebook.
readr::write_csv(corp_summary, "ndc_corp_summary.csv")

```

# Pre-processing

Tokenizing corpus.

```{r}
# Word-level tokenisation with basic clean-up: punctuation, symbols, numbers
# and URLs are dropped, while hyphenated words are kept as single tokens.
tok <- ndc_corpus %>%
  tokens(
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    split_hyphens = FALSE,
    verbose = TRUE
  )
```

Lowercasing and removing stopwords

```{r}
# Normalise case first so that the stopword match below is case-consistent.
tok <- tok %>% tokens_tolower()

# Drop English stopwords without leaving empty padding tokens behind.
tok.r <- tok %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)

```

# Setting up agreed dictionaries

Creating compound tokens from the key terms (phrases) in our dictionaries:
 
```{r}

# Multi-word key terms, written as plain phrases and split on the single space
# into one character vector per phrase (the word sequences to be compounded).
mylist <- strsplit(
  c(
    "air pollution",
    "mental disorder",
    "mental disorders",
    "climate change",
    "changing climate",
    "climate emergency",
    "climate crisis",
    "climate decay",
    "global warming",
    "green house",
    "extreme weather",
    "global environmental change",
    "climate variability",
    "low carbon",
    "renewable energy",
    "carbon emission",
    "carbon emissions",
    "carbon dioxide",
    "co2 emission",
    "co2 emissions",
    "climate pollutant",
    "climate pollutants",
    "carbon neutral",
    "carbon neutrality",
    "climate neutrality",
    "climate action",
    "net zero"
  ),
  split = " ",
  fixed = TRUE
)
 
```
 

```{r}
# Join each multi-word key term into a single "_"-separated token so the
# dictionaries can match it as one entry.
tok.compound <- tok.r %>%
  tokens_compound(pattern = mylist, valuetype = "fixed", concatenator = "_")
```


Creating the dictionary of climate change terms:

```{r}
# Climate-change dictionary: a single "climate" key. Entries containing "_"
# are the compound tokens; hyphenated and single-word entries are listed as
# written.
climate_dict <- dictionary(list(
  climate = c(
    "climate_change", "changing_climate", "climate_emergency",
    "climate_crisis", "climate_decay", "global_warming",
    "green_house", "temperature", "extreme_weather",
    "global_environmental_change", "climate_variability",
    "greenhouse", "greenhouse-gas", "low_carbon",
    "ghge", "ghges", "renewable_energy",
    "carbon_emission", "carbon_emissions",
    "carbon_dioxide", "carbon-dioxide",
    "co2_emission", "co2_emissions",
    "climate_pollutant", "climate_pollutants",
    "decarbonization", "decarbonisation",
    "carbon_neutral", "carbon-neutral", "carbon_neutrality",
    "climate_neutrality", "climate_action",
    "net-zero", "net_zero"
  )
))

```

Creating the dictionary of health terms:

```{r}
# Health dictionary: a single "health" key. Entries containing "_" are the
# compound tokens. "nutrition" was previously listed twice; the redundant
# entry is removed, leaving the set of terms unchanged.
health_dict <- dictionary(list(
  health = c(
    "malaria", "diarrhoea", "infection", "disease", "diseases",
    "sars", "measles", "pneumonia",
    "epidemic", "epidemics", "pandemic", "pandemics", "epidemiology",
    "healthcare", "health", "mortality", "morbidity",
    "nutrition", "illness", "illnesses", "ncd", "ncds",
    "air_pollution", "malnutrition", "malnourishment",
    "mental_disorder", "mental_disorders", "stunting"
  )
))
```



## KWIC



```{r}
# Keyword-in-context matches for each dictionary, keeping 25 tokens of context
# on either side of every hit.
tok.hea <- tok.compound %>%
  kwic(pattern = health_dict, window = 25, valuetype = "fixed")

tok.cc <- tok.compound %>%
  kwic(pattern = climate_dict, window = 25, valuetype = "fixed")

```



```{r}
# Rebuild a corpus in which each document is one health match together with
# its surrounding context window (context is not split into pre/post parts).
corpus_health <- corpus(tok.hea, split_context = FALSE, extract_keyword = TRUE)

# Search those health contexts for climate terms. kwic() on a corpus object is
# deprecated in quanteda 3 and defunct in 4, so tokenise explicitly first;
# this is the same tokenisation the corpus method applied implicitly.
tok.climate.kwic <- kwic(
  tokens(corpus_health),
  pattern = climate_dict,
  window = 25,
  valuetype = "fixed"
)

```



```{r}
# Count, per source document, how many health, climate and health-and-climate
# ("intersection") keyword matches were found. Every document in the
# kwic-derived corpora is one match, so the number of rows per source document
# id is the number of matches.

# dfm() on a corpus object is deprecated in quanteda 3 and defunct in 4, so
# each corpus is tokenised explicitly before building the dfm.
health_dfm <- dfm(tokens(corpus_health))

corpus_climate <- corpus(tok.cc, split_context = FALSE, extract_keyword = TRUE)
climate_dfm <- dfm(tokens(corpus_climate))

corpus_intersection <- corpus(
  tok.climate.kwic,
  split_context = FALSE,
  extract_keyword = TRUE
)
intersection_dfm <- dfm(tokens(corpus_intersection))

# Convert a dfm to a data frame whose first column is named "docid" and holds
# only the part of the document name before the first ".".
# NOTE(review): a source document id that itself contains "." would be
# truncated by this pattern - confirm none do.
dfm_to_doc_frame <- function(x) {
  out <- convert(x, "data.frame")
  names(out)[1] <- "docid"
  out$docid <- str_extract(out$docid, "^([^.]+)")
  out
}

health <- dfm_to_doc_frame(health_dfm)
climate <- dfm_to_doc_frame(climate_dfm)
intersection <- dfm_to_doc_frame(intersection_dfm)

# One row per match, so n() per docid is the match count.
health_counts <- health %>%
  group_by(docid) %>%
  summarise(health_count = n())
climate_counts <- climate %>%
  group_by(docid) %>%
  summarise(climate_count = n())
intersection_counts <- intersection %>%
  group_by(docid) %>%
  summarise(intersection_count = n())

# Full joins keep documents that appear in only some of the tables; their
# missing counts become 0 (integer, matching the type returned by n()).
total_counts <- climate_counts %>%
  full_join(health_counts, by = "docid") %>%
  full_join(intersection_counts, by = "docid") %>%
  replace_na(list(
    health_count = 0L,
    climate_count = 0L,
    intersection_count = 0L
  )) %>%
  separate(docid, into = c("country", "year"), sep = "_")

# The year parsed from the document id is used only to order the rows.
total_counts$year <- as.numeric(total_counts$year)

total_counts <- arrange(total_counts, country, year)

# NOTE(review): every row's year is then overwritten with 2019, discarding the
# parsed value - confirm this is intended.
total_counts$year <- 2019

```


```{r}
# Save the per-document count table as CSV next to the notebook.
readr::write_csv(x = total_counts, "total_counts_ndc.csv")
```

